# -*- coding: utf-8 -*-
# ===========================================
# @Time    : 2021/10/8 10:19 
# @Author  : shutao
# @FileName: request_dxf_documentation.py
# @remark  : 
# 
# @Software: PyCharm
# Github 　： https://github.com/NameLacker
# ===========================================
import socket

from bs4 import BeautifulSoup
import requests
import random
import time
import os

# Anti-anti-crawler settings.
# Apply a 20 second default timeout to every socket created afterwards.
socket.setdefaulttimeout(20)

# Pool of mobile-browser User-Agent strings; one is picked at random for each request.
user_agent = [
    "Mozilla/5.0 (Linux; U; Android 6.0.1; zh-cn; MI NOTE LTE Build/MMB29M) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/53.0.2785.146 Mobile Safari/537.36 XiaoMi/MiuiBrowser/8.8.7",
    "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Mobile Safari/537.36",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 11_4_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.0 Mobile/15E148 Safari/604.1",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 10_2_1 like Mac OS X) AppleWebKit/602.4.6 (KHTML, like Gecko) Version/10.0 Mobile/14D27 Safari/602.1",
    "Mozilla/5.0 (Linux; Android 5.1.1; Nexus 5 Build/LMY48B; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/43.0.2357.65 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; U; Android-4.0.3; en-us; Galaxy Nexus Build/IML74K) AppleWebKit/535.7 (KHTML, like Gecko) CrMo/16.0.912.75 Mobile Safari/535.7",
    "Mozilla/5.0 (Linux; U; Android-4.0.3; en-us; Xoom Build/IML77) AppleWebKit/535.7 (KHTML, like Gecko) CrMo/16.0.912.75 Safari/535.7",
    "Mozilla/5.0 (Linux;u;Android 4.2.2;zh-cn;) AppleWebKit/534.46 (KHTML,like Gecko) Version/5.1 Mobile Safari/10600.6.3",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 10_3 like Mac OS X) AppleWebKit/602.1.50 (KHTML, like Gecko) CriOS/56.0.2924.75 Mobile/14E5239e YisouSpider/5.0 Safari/602.1",
    "Mozilla/5.0 (Linux; Android 4.0; GT-I9300 Build/IMM76D) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/59.0.3071.92",
    "Mozilla/5.0 (Linux; Android 6.0.1; SOV33 Build/35.0.D.0.326) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.91 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; Android 6.0; HUAWEI MLA-AL10 Build/HUAWEIMLA-AL10) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.96 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; Android 7.1.1; vivo X20A Build/NMF26X; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/62.0.3202.84 Mobile Safari/537.36 VivoBrowser/5.6.1.1",
    "Mozilla/5.0 (Linux; U; Android 6.0.1; zh-CN; SM-J7108 Build/MMB29K) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/57.0.2987.108 UCBrowser/11.9.7.977 Mobile Safari/537.36",
    "Mozilla/6.0 (Linux; Android 8.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.183 Mobile Safari/537.36",
]

# 保存根目录
root_path = "E:\\project_c++\\resolve_dxf\\resolve\\docs"

sections = [
    # AutoCAD 2011 dxf文件格式爬虫配置列表

    # HEADER 段爬虫配置
    [
        # HEADER 段组码
        {
            "url": "http://docs.autodesk.com/ACD/2011/CHS/filesDXF/WS1a9193826455f5ff18cb41610ec0a2e719-7a6f.htm",
            "filename": os.path.join(root_path, "data/headers/HEADER_code.txt"),
            "remark": "开始爬取 HEADERS 段组码配置..."
        },

        # 修订的 VPORT 标题变量
        {
            "url": "http://docs.autodesk.com/ACD/2011/CHS/filesDXF/WS1a9193826455f5ff18cb41610ec0a2e719-7a6d.htm",
            "filename": os.path.join(root_path, "data/headers/revise_VPORT_code.txt"),
            "remark": "开始爬取修订的 VPORT 标题变量段组码配置..."
        }
    ],

    # CLASSES 段爬虫配置
    [
        # CLASSES 段组码
        {
            "url": "http://docs.autodesk.com/ACD/2011/CHS/filesDXF/WS1a9193826455f5ff18cb41610ec0a2e719-7a60.htm",
            "filename": os.path.join(root_path, "data/classes/CLASSES_code.txt"),
            "remark": "开始爬取 CLASSES 段组码配置..."
        }
    ],

    # TABLES 段爬虫配置
    [
        # 通用符号表组码
        {
            "url": "http://docs.autodesk.com/ACD/2011/CHS/filesDXF/WS1a9193826455f5ff18cb41610ec0a2e719-7a5b.htm",
            "filename": os.path.join(root_path, "data/tables/universal_symbol_table_code.txt"),
            "remark": "开始爬取通用符号表组码配置..."
        },
        # 符号表条目的通用组码
        {
            "url": "http://docs.autodesk.com/ACD/2011/CHS/filesDXF/WS1a9193826455f5ff18cb41610ec0a2e719-7a59.htm",
            "filename": os.path.join(root_path, "data/tables/universal_group_code_for_symbol_table_entries.txt"),
            "remark": "开始爬取符号表条目的通用组码配置..."
        },
        # APPID 符号表条目
        {
            "url": "http://docs.autodesk.com/ACD/2011/CHS/filesDXF/WS1a9193826455f5ff18cb41610ec0a2e719-7a57.htm",
            "filename": os.path.join(root_path, "data/tables/APPID_code.txt"),
            "remark": "开始爬取 APPID 符号表条目组码配置..."
        },
        # BLOCK_RECORD 符号表条目
        {
            "url": "http://docs.autodesk.com/ACD/2011/CHS/filesDXF/WS1a9193826455f5ff18cb41610ec0a2e719-7a55.htm",
            "filename": os.path.join(root_path, "data/tables/BLOCK_RECORD_code.txt"),
            "remark": "开始爬取 BLOCK_RECORD 符号表条目组码配置..."
        },
        # DIMSTYLE 符号表条目
        {
            "url": "http://docs.autodesk.com/ACD/2011/CHS/filesDXF/WS1a9193826455f5ff18cb41610ec0a2e719-7a53.htm",
            "filename": os.path.join(root_path, "data/tables/DIMSTYLE_code.txt"),
            "remark": "开始爬取 DIMSTYLE 符号表条目组码配置..."
        },
        # LAYER 符号表条目
        {
            "url": "http://docs.autodesk.com/ACD/2011/CHS/filesDXF/WS1a9193826455f5ff18cb41610ec0a2e719-7a51.htm",
            "filename": os.path.join(root_path, "data/tables/LAYER_code.txt"),
            "remark": "开始爬取 LAYER 符号表条目组码配置..."
        },
        # LTYPE 符号表条目
        {
            "url": "http://docs.autodesk.com/ACD/2011/CHS/filesDXF/WS1a9193826455f5ff18cb41610ec0a2e719-7a4f.htm",
            "filename": os.path.join(root_path, "data/tables/LTYPE_code.txt"),
            "remark": "开始爬取 LTYPE 符号表条目组码配置..."
        },
        # STYLE 符号表条目
        {
            "url": "http://docs.autodesk.com/ACD/2011/CHS/filesDXF/WS1a9193826455f5ff18cb41610ec0a2e719-7a4d.htm",
            "filename": os.path.join(root_path, "data/tables/STYLE_code.txt"),
            "remark": "开始爬取 STYLE 符号表条目组码配置..."
        },
        # UCS 符号表条目
        {
            "url": "http://docs.autodesk.com/ACD/2011/CHS/filesDXF/WS1a9193826455f5ff18cb41610ec0a2e719-7a4b.htm",
            "filename": os.path.join(root_path, "data/tables/UCS_code.txt"),
            "remark": "开始爬取 UCS 符号表条目组码配置..."
        },
        # VIEW 符号表条目
        {
            "url": "http://docs.autodesk.com/ACD/2011/CHS/filesDXF/WS1a9193826455f5ff18cb41610ec0a2e719-7a49.htm",
            "filename": os.path.join(root_path, "data/tables/VIEW_code.txt"),
            "remark": "开始爬取 VIEW 符号表条目组码配置..."
        },
        # VPORT 符号表条目
        {
            "url": "http://docs.autodesk.com/ACD/2011/CHS/filesDXF/WS1a9193826455f5ff18cb41610ec0a2e719-7a46.htm",
            "filename": os.path.join(root_path, "data/tables/VPORT_code.txt"),
            "remark": "开始爬取 VPORT 符号表条目组码配置..."
        }
    ],

    # BLOCKS 段爬虫配置
    [
        # BLOCK 块图元组码
        {
            "url": "http://docs.autodesk.com/ACD/2011/CHS/filesDXF/WS1a9193826455f5ff18cb41610ec0a2e719-7a43.htm",
            "filename": os.path.join(root_path, "data/blocks/BLOCK_code.txt"),
            "remark": "开始爬取 BLOCK 块图元组码配置..."
        },
        # ENDBLK 对象组码
        {
            "url": "http://docs.autodesk.com/ACD/2011/CHS/filesDXF/WS1a9193826455f5ff18cb41610ec0a2e719-7a3f.htm",
            "filename": os.path.join(root_path, "data/blocks/ENDBLK_code.txt"),
            "remark": "开始爬取 ENDBLK 对象组码配置..."
        }
    ],

    # ENTITIES 段爬虫配置
    [
        # 通用图元组码
        {
            "url": "http://docs.autodesk.com/ACD/2011/CHS/filesDXF/WS1a9193826455f5ff18cb41610ec0a2e719-7a3d.htm",
            "filename": os.path.join(root_path, "data/entities/generic_entity_code.txt"),
            "remark": "开始爬取通用图元组码配置..."
        },
        # 3DFACE 图元组码
        {
            "url": "http://docs.autodesk.com/ACD/2011/CHS/filesDXF/WS1a9193826455f5ff18cb41610ec0a2e719-7a3b.htm",
            "filename": os.path.join(root_path, "data/entities/3DFACE_code.txt"),
            "remark": "开始爬取 3DFACE 图元组码配置..."
        },
        # 3DSOLID 图元组码
        {
            "url": "http://docs.autodesk.com/ACD/2011/CHS/filesDXF/WS1a9193826455f5ff18cb41610ec0a2e719-7a39.htm",
            "filename": os.path.join(root_path, "data/entities/3DSOLID_code.txt"),
            "remark": "开始爬取 3DSOLID 图元组码配置..."
        },
        # ACAD_PROXY_ENTITY 图元组码
        {
            "url": "http://docs.autodesk.com/ACD/2011/CHS/filesDXF/WS1a9193826455f5ff18cb41610ec0a2e719-7a37.htm",
            "filename": os.path.join(root_path, "data/entities/ACAD_PROXY_ENTITY_code.txt"),
            "remark": "开始爬取 ACAD_PROXY_ENTITY 图元组码配置..."
        },
        # ARC 图元组码
        {
            "url": "http://docs.autodesk.com/ACD/2011/CHS/filesDXF/WS1a9193826455f5ff18cb41610ec0a2e719-7a35.htm",
            "filename": os.path.join(root_path, "data/entities/ARC_code.txt"),
            "remark": "开始爬取 ARC 图元组码配置..."
        },
        # ATTDEF 图元组码
        {
            "url": "http://docs.autodesk.com/ACD/2011/CHS/filesDXF/WS1a9193826455f5ff18cb41610ec0a2e719-7a33.htm",
            "filename": os.path.join(root_path, "data/entities/ATTDEF_code.txt"),
            "remark": "开始爬取 ATTDEF 图元组码配置..."
        },
        # ATTRIB 图元组码
        {
            "url": "http://docs.autodesk.com/ACD/2011/CHS/filesDXF/WS1a9193826455f5ff18cb41610ec0a2e719-7a31.htm",
            "filename": os.path.join(root_path, "data/entities/ATTRIB_code.txt"),
            "remark": "开始爬取 ATTRIB 图元组码配置..."
        },
        # BODY 图元组码
        {
            "url": "http://docs.autodesk.com/ACD/2011/CHS/filesDXF/WS1a9193826455f5ff18cb41610ec0a2e719-7a2f.htm",
            "filename": os.path.join(root_path, "data/entities/BODY_code.txt"),
            "remark": "开始爬取 BODY 图元组码配置..."
        },
        # CIRCLE 图元组码
        {
            "url": "http://docs.autodesk.com/ACD/2011/CHS/filesDXF/WS1a9193826455f5ff18cb41610ec0a2e719-7a2d.htm",
            "filename": os.path.join(root_path, "data/entities/CIRCLE_code.txt"),
            "remark": "开始爬取 CIRCLE 图元组码配置..."
        },

        # DIMENSION
        # 通用标注组码
        {
            "url": "http://docs.autodesk.com/ACD/2011/CHS/filesDXF/WS1a9193826455f5ff18cb41610ec0a2e719-7a2a.htm",
            "filename": os.path.join(root_path, "data/entities/universal_labeling_group_code.txt"),
            "remark": "开始爬取通用标注组码图元组码配置..."
        },
        # 对齐标注组码
        {
            "url": "http://docs.autodesk.com/ACD/2011/CHS/filesDXF/WS1a9193826455f5ff18cb41610ec0a2e719-7a26.htm",
            "filename": os.path.join(root_path, "data/entities/align_label_group_code.txt"),
            "remark": "开始爬取对齐标注组码图元组码配置..."
        },
        # 线性标注和转角标注组码
        {
            "url": "http://docs.autodesk.com/ACD/2011/CHS/filesDXF/WS1a9193826455f5ff18cb41610ec0a2e719-7a23.htm",
            "filename": os.path.join(root_path, "data/entities/linear_and_corner_labeling_codes.txt"),
            "remark": "开始爬取线性标注和转角标注组码图元组码配置..."
        },
        # 半径标注和直径标注组码
        {
            "url": "http://docs.autodesk.com/ACD/2011/CHS/filesDXF/WS1a9193826455f5ff18cb41610ec0a2e719-7a21.htm",
            "filename": os.path.join(root_path, "data/entities/radius_and_diameter_labeling_codes.txt"),
            "remark": "开始爬取半径标注和直径标注组码图元组码配置..."
        },
        # 角度标注组码
        {
            "url": "http://docs.autodesk.com/ACD/2011/CHS/filesDXF/WS1a9193826455f5ff18cb41610ec0a2e719-7a1d.htm",
            "filename": os.path.join(root_path, "data/entities/angle_marking_group_code.txt"),
            "remark": "开始爬取角度标注组码图元组码配置..."
        },
        # 坐标标注组码
        {
            "url": "http://docs.autodesk.com/ACD/2011/CHS/filesDXF/WS1a9193826455f5ff18cb41610ec0a2e719-7a19.htm",
            "filename": os.path.join(root_path, "data/entities/coordinate_marking_group_code.txt"),
            "remark": "开始爬取坐标标注组码图元组码配置..."
        },

        # ELLIPSE 图元组码
        {
            "url": "http://docs.autodesk.com/ACD/2011/CHS/filesDXF/WS1a9193826455f5ff18cb41610ec0a2e719-7a15.htm",
            "filename": os.path.join(root_path, "data/entities/ELLIPSE_code.txt"),
            "remark": "开始爬取 ELLIPSE 图元组码配置..."
        },
        # HATCH 图元组码
        {
            "url": "http://docs.autodesk.com/ACD/2011/CHS/filesDXF/WS1a9193826455f5ff18cb41610ec0a2e719-7a13.htm",
            "filename": os.path.join(root_path, "data/entities/HATCH_code.txt"),
            "remark": "开始爬取 HATCH 图元组码配置..."
        },
        # HELIX 图元组码
        {
            "url": "http://docs.autodesk.com/ACD/2011/CHS/filesDXF/WS1a9193826455f5ff18cb41610ec0a2e719-7a08.htm",
            "filename": os.path.join(root_path, "data/entities/HELIX_code.txt"),
            "remark": "开始爬取 HELIX 图元组码配置..."
        },
        # IMAGE 图元组码
        {
            "url": "http://docs.autodesk.com/ACD/2011/CHS/filesDXF/WS1a9193826455f5ff18cb41610ec0a2e719-7a06.htm",
            "filename": os.path.join(root_path, "data/entities/IMAGE_code.txt"),
            "remark": "开始爬取 IMAGE 图元组码配置..."
        },
        # INSERT 图元组码
        {
            "url": "http://docs.autodesk.com/ACD/2011/CHS/filesDXF/WS1a9193826455f5ff18cb41610ec0a2e719-7a04.htm",
            "filename": os.path.join(root_path, "data/entities/INSERT_code.txt"),
            "remark": "开始爬取 INSERT 图元组码配置..."
        },
        # LEADER 图元组码
        {
            "url": "http://docs.autodesk.com/ACD/2011/CHS/filesDXF/WS1a9193826455f5ff18cb41610ec0a2e719-7a02.htm",
            "filename": os.path.join(root_path, "data/entities/LEADER_code.txt"),
            "remark": "开始爬取 LEADER 图元组码配置..."
        },
        # LIGHT 图元组码
        {
            "url": "http://docs.autodesk.com/ACD/2011/CHS/filesDXF/WS1a9193826455f5ff18cb41610ec0a2e719-7a00.htm",
            "filename": os.path.join(root_path, "data/entities/LIGHT_code.txt"),
            "remark": "开始爬取 LIGHT 图元组码配置..."
        },
        # LINE 图元组码
        {
            "url": "http://docs.autodesk.com/ACD/2011/CHS/filesDXF/WS1a9193826455f5ff18cb41610ec0a2e719-79fe.htm",
            "filename": os.path.join(root_path, "data/entities/LINE_code.txt"),
            "remark": "开始爬取 LINE 图元组码配置..."
        },
        # LWPOLYLINE 图元组码
        {
            "url": "http://docs.autodesk.com/ACD/2011/CHS/filesDXF/WS1a9193826455f5ff18cb41610ec0a2e719-79fc.htm",
            "filename": os.path.join(root_path, "data/entities/LWPOLYLINE_code.txt"),
            "remark": "开始爬取 LWPOLYLINE 图元组码配置..."
        },
        # MESH 图元组码
        {
            "url": "http://docs.autodesk.com/ACD/2011/CHS/filesDXF/WS1a9193826455f5ff211a40be11dcc57291cfca.htm",
            "filename": os.path.join(root_path, "data/entities/MESH_code.txt"),
            "remark": "开始爬取 MESH 图元组码配置..."
        },
        # MLINE 图元组码
        {
            "url": "http://docs.autodesk.com/ACD/2011/CHS/filesDXF/WS1a9193826455f5ff18cb41610ec0a2e719-79fa.htm",
            "filename": os.path.join(root_path, "data/entities/MLINE_code.txt"),
            "remark": "开始爬取 MLINE 图元组码配置..."
        },
        # MLEADERSTYLE 图元组码
        {
            "url": "http://docs.autodesk.com/ACD/2011/CHS/filesDXF/WS1a9193826455f5ffc751fd10f4618e369-5bdf.htm",
            "filename": os.path.join(root_path, "data/entities/MLEADERSTYLE_code.txt"),
            "remark": "开始爬取 MLEADERSTYLE 图元组码配置..."
        },
        # MLEADER 图元组码
        {
            "url": "http://docs.autodesk.com/ACD/2011/CHS/filesDXF/WS1a9193826455f5ffc751fd10f4618e369-5bd9.htm",
            "filename": os.path.join(root_path, "data/entities/MLEADER_code.txt"),
            "remark": "开始爬取 MLEADER 图元组码配置..."
        },
        # MTEXT 图元组码
        {
            "url": "http://docs.autodesk.com/ACD/2011/CHS/filesDXF/WS1a9193826455f5ff18cb41610ec0a2e719-79f8.htm",
            "filename": os.path.join(root_path, "data/entities/MTEXT_code.txt"),
            "remark": "开始爬取 MTEXT 图元组码配置..."
        },
        # OLEFRAME 图元组码
        {
            "url": "http://docs.autodesk.com/ACD/2011/CHS/filesDXF/WS1a9193826455f5ff18cb41610ec0a2e719-79f6.htm",
            "filename": os.path.join(root_path, "data/entities/OLEFRAME_code.txt"),
            "remark": "开始爬取 OLEFRAME 图元组码配置..."
        },
        # OLE2FRAME 图元组码
        {
            "url": "http://docs.autodesk.com/ACD/2011/CHS/filesDXF/WS1a9193826455f5ff18cb41610ec0a2e719-79f4.htm",
            "filename": os.path.join(root_path, "data/entities/OLE2FRAME_code.txt"),
            "remark": "开始爬取 OLE2FRAME 图元组码配置..."
        },
        # POINT 图元组码
        {
            "url": "http://docs.autodesk.com/ACD/2011/CHS/filesDXF/WS1a9193826455f5ff18cb41610ec0a2e719-79f2.htm",
            "filename": os.path.join(root_path, "data/entities/POINT_code.txt"),
            "remark": "开始爬取 POINT 图元组码配置..."
        },
        # POLYLINE 图元组码
        {
            "url": "http://docs.autodesk.com/ACD/2011/CHS/filesDXF/WS1a9193826455f5ff18cb41610ec0a2e719-79f0.htm",
            "filename": os.path.join(root_path, "data/entities/POLYLINE_code.txt"),
            "remark": "开始爬取 POLYLINE 图元组码配置..."
        },
        # RAY 图元组码
        {
            "url": "http://docs.autodesk.com/ACD/2011/CHS/filesDXF/WS1a9193826455f5ff18cb41610ec0a2e719-79ed.htm",
            "filename": os.path.join(root_path, "data/entities/RAY_code.txt"),
            "remark": "开始爬取 RAY 图元组码配置..."
        },
        # REGION 图元组码
        {
            "url": "http://docs.autodesk.com/ACD/2011/CHS/filesDXF/WS1a9193826455f5ff18cb41610ec0a2e719-79eb.htm",
            "filename": os.path.join(root_path, "data/entities/REGION_code.txt"),
            "remark": "开始爬取 REGION 图元组码配置..."
        },
        # SECTION 图元组码
        {
            "url": "http://docs.autodesk.com/ACD/2011/CHS/filesDXF/WS1a9193826455f5ff18cb41610ec0a2e719-79e9.htm",
            "filename": os.path.join(root_path, "data/entities/SECTION_code.txt"),
            "remark": "开始爬取 SECTION 图元组码配置..."
        },
        # SEQEND 图元组码
        {
            "url": "http://docs.autodesk.com/ACD/2011/CHS/filesDXF/WS1a9193826455f5ff18cb41610ec0a2e719-79e7.htm",
            "filename": os.path.join(root_path, "data/entities/SEQEND_code.txt"),
            "remark": "开始爬取 SEQEND 图元组码配置..."
        },
        # SHAPE 图元组码
        {
            "url": "http://docs.autodesk.com/ACD/2011/CHS/filesDXF/WS1a9193826455f5ff18cb41610ec0a2e719-79e5.htm",
            "filename": os.path.join(root_path, "data/entities/SHAPE_code.txt"),
            "remark": "开始爬取 SHAPE 图元组码配置..."
        },
        # SOLID 图元组码
        {
            "url": "http://docs.autodesk.com/ACD/2011/CHS/filesDXF/WS1a9193826455f5ff18cb41610ec0a2e719-79e3.htm",
            "filename": os.path.join(root_path, "data/entities/SOLID_code.txt"),
            "remark": "开始爬取 SOLID 图元组码配置..."
        },
        # SPLINE 图元组码
        {
            "url": "http://docs.autodesk.com/ACD/2011/CHS/filesDXF/WS1a9193826455f5ff18cb41610ec0a2e719-79e1.htm",
            "filename": os.path.join(root_path, "data/entities/SPLINE_code.txt"),
            "remark": "开始爬取 SPLINE 图元组码配置..."
        },
        # SUN 图元组码
        {
            "url": "http://docs.autodesk.com/ACD/2011/CHS/filesDXF/WS1a9193826455f5ff18cb41610ec0a2e719-79df.htm",
            "filename": os.path.join(root_path, "data/entities/SUN_code.txt"),
            "remark": "开始爬取 SUN 图元组码配置..."
        },
        # SURFACE 图元组码
        {
            "url": "http://docs.autodesk.com/ACD/2011/CHS/filesDXF/WS1a9193826455f5ff18cb41610ec0a2e719-79dd.htm",
            "filename": os.path.join(root_path, "data/entities/SURFACE_code.txt"),
            "remark": "开始爬取 SURFACE 图元组码配置..."
        },
        # TABLE 图元组码
        {
            "url": "http://docs.autodesk.com/ACD/2011/CHS/filesDXF/WS1a9193826455f5ff18cb41610ec0a2e719-79d3.htm",
            "filename": os.path.join(root_path, "data/entities/TABLE_code.txt"),
            "remark": "开始爬取 TABLE 图元组码配置..."
        },
        # TEXT 图元组码
        {
            "url": "http://docs.autodesk.com/ACD/2011/CHS/filesDXF/WS1a9193826455f5ff18cb41610ec0a2e719-79d1.htm",
            "filename": os.path.join(root_path, "data/entities/TEXT_code.txt"),
            "remark": "开始爬取 TEXT 图元组码配置..."
        },
        # TOLERANCE 图元组码
        {
            "url": "http://docs.autodesk.com/ACD/2011/CHS/filesDXF/WS1a9193826455f5ff18cb41610ec0a2e719-79ce.htm",
            "filename": os.path.join(root_path, "data/entities/TOLERANCE_code.txt"),
            "remark": "开始爬取 TOLERANCE 图元组码配置..."
        },
        # TRACE 图元组码
        {
            "url": "http://docs.autodesk.com/ACD/2011/CHS/filesDXF/WS1a9193826455f5ff18cb41610ec0a2e719-79cc.htm",
            "filename": os.path.join(root_path, "data/entities/TRACE_code.txt"),
            "remark": "开始爬取 TRACE 图元组码配置..."
        },
        # UNDERLAY 图元组码
        {
            "url": "http://docs.autodesk.com/ACD/2011/CHS/filesDXF/WS1a9193826455f5ff18cb41610ec0a2e719-79ca.htm",
            "filename": os.path.join(root_path, "data/entities/UNDERLAY_code.txt"),
            "remark": "开始爬取 UNDERLAY 图元组码配置..."
        },
        # VERTEX 图元组码
        {
            "url": "http://docs.autodesk.com/ACD/2011/CHS/filesDXF/WS1a9193826455f5ff18cb41610ec0a2e719-79c8.htm",
            "filename": os.path.join(root_path, "data/entities/VERTEX_code.txt"),
            "remark": "开始爬取 VERTEX 图元组码配置..."
        },
        # VIEWPORT 图元组码
        {
            "url": "http://docs.autodesk.com/ACD/2011/CHS/filesDXF/WS1a9193826455f5ff18cb41610ec0a2e719-79c6.htm",
            "filename": os.path.join(root_path, "data/entities/VIEWPORT_code.txt"),
            "remark": "开始爬取 VIEWPORT 图元组码配置..."
        },
        # WIPEOUT 图元组码
        {
            "url": "http://docs.autodesk.com/ACD/2011/CHS/filesDXF/WS1a9193826455f5ff18cb41610ec0a2e719-79c4.htm",
            "filename": os.path.join(root_path, "data/entities/WIPEOUT_code.txt"),
            "remark": "开始爬取 WIPEOUT 图元组码配置..."
        },
        # XLINE 图元组码
        {
            "url": "http://docs.autodesk.com/ACD/2011/CHS/filesDXF/WS1a9193826455f5ff18cb41610ec0a2e719-79c2.htm",
            "filename": os.path.join(root_path, "data/entities/XLINE_code.txt"),
            "remark": "开始爬取 XLINE 图元组码配置..."
        }
    ],

    # OBJECTS 段爬虫配置
    [
        # 通用对象组码
        {
            "url": "http://docs.autodesk.com/ACD/2011/CHS/filesDXF/WS1a9193826455f5ff18cb41610ec0a2e719-79be.htm",
            "filename": os.path.join(root_path, "data/objects/generic_object_group_code.txt"),
            "remark": "开始爬取通用对象组码配置"
        },
        # ACAD_PROXY_OBJECT 对象
        {
            "url": "http://docs.autodesk.com/ACD/2011/CHS/filesDXF/WS1a9193826455f5ff18cb41610ec0a2e719-79bc.htm",
            "filename": os.path.join(root_path, "data/objects/ACAD_PROXY_OBJECT_code.txt"),
            "remark": "开始爬取 ACAD_PROXY_OBJECT 对象组码配置"
        },
        # ACDBDICTIONARYWDFLT 对象
        {
            "url": "http://docs.autodesk.com/ACD/2011/CHS/filesDXF/WS1a9193826455f5ff18cb41610ec0a2e719-79ba.htm",
            "filename": os.path.join(root_path, "data/objects/ACDBDICTIONARYWDFLT_code.txt"),
            "remark": "开始爬取 ACDBDICTIONARYWDFLT 对象组码配置"
        },
        # ACDBPLACEHOLDER 对象
        {
            "url": "http://docs.autodesk.com/ACD/2011/CHS/filesDXF/WS1a9193826455f5ff18cb41610ec0a2e719-79b8.htm",
            "filename": os.path.join(root_path, "data/objects/ACDBPLACEHOLDER_code.txt"),
            "remark": "开始爬取 ACDBPLACEHOLDER 对象组码配置"
        },
        # DATATABLE 对象
        {
            "url": "http://docs.autodesk.com/ACD/2011/CHS/filesDXF/WS1a9193826455f5ff18cb41610ec0a2e719-79b6.htm",
            "filename": os.path.join(root_path, "data/objects/DATATABLE_code.txt"),
            "remark": "开始爬取 DATATABLE 对象组码配置"
        },
        # DICTIONARY 对象
        {
            "url": "http://docs.autodesk.com/ACD/2011/CHS/filesDXF/WS1a9193826455f5ff18cb41610ec0a2e719-79b4.htm",
            "filename": os.path.join(root_path, "data/objects/DICTIONARY_code.txt"),
            "remark": "开始爬取 DICTIONARY 对象组码配置"
        },
        # DICTIONARYVAR 对象
        {
            "url": "http://docs.autodesk.com/ACD/2011/CHS/filesDXF/WS1a9193826455f5ff18cb41610ec0a2e719-79b2.htm",
            "filename": os.path.join(root_path, "data/objects/DICTIONARYVAR_code.txt"),
            "remark": "开始爬取 DICTIONARYVAR 对象组码配置"
        },
        # DIMASSOC 对象
        {
            "url": "http://docs.autodesk.com/ACD/2011/CHS/filesDXF/WS1a9193826455f5ff18cb41610ec0a2e719-79b0.htm",
            "filename": os.path.join(root_path, "data/objects/DIMASSOC_code.txt"),
            "remark": "开始爬取 DIMASSOC 对象组码配置"
        },
        # FIELD 对象
        {
            "url": "http://docs.autodesk.com/ACD/2011/CHS/filesDXF/WS1a9193826455f5ff18cb41610ec0a2e719-79ae.htm",
            "filename": os.path.join(root_path, "data/objects/FIELD_code.txt"),
            "remark": "开始爬取 FIELD 对象组码配置"
        },
        # GEODATA 对象
        {
            "url": "http://docs.autodesk.com/ACD/2011/CHS/filesDXF/WS73099cc142f4875535a241551166ac8792f-963.htm",
            "filename": os.path.join(root_path, "data/objects/GEODATA_code.txt"),
            "remark": "开始爬取 GEODATA 对象组码配置"
        },
        # GROUP 对象
        {
            "url": "http://docs.autodesk.com/ACD/2011/CHS/filesDXF/WS1a9193826455f5ff18cb41610ec0a2e719-79ac.htm",
            "filename": os.path.join(root_path, "data/objects/GROUP_code.txt"),
            "remark": "开始爬取 GROUP 对象组码配置"
        },
        # IDBUFFER 对象
        {
            "url": "http://docs.autodesk.com/ACD/2011/CHS/filesDXF/WS1a9193826455f5ff18cb41610ec0a2e719-79aa.htm",
            "filename": os.path.join(root_path, "data/objects/IDBUFFER_code.txt"),
            "remark": "开始爬取 IDBUFFER 对象组码配置"
        },
        # IMAGEDEF 对象
        {
            "url": "http://docs.autodesk.com/ACD/2011/CHS/filesDXF/WS1a9193826455f5ff18cb41610ec0a2e719-79a8.htm",
            "filename": os.path.join(root_path, "data/objects/IMAGEDEF_code.txt"),
            "remark": "开始爬取 IMAGEDEF 对象组码配置"
        },
        # IMAGEDEF_REACTOR 对象
        {
            "url": "http://docs.autodesk.com/ACD/2011/CHS/filesDXF/WS1a9193826455f5ff18cb41610ec0a2e719-79a6.htm",
            "filename": os.path.join(root_path, "data/objects/IMAGEDEF_REACTOR_code.txt"),
            "remark": "开始爬取 IMAGEDEF_REACTOR 对象组码配置"
        },
        # LAYER_INDEX 对象
        {
            "url": "http://docs.autodesk.com/ACD/2011/CHS/filesDXF/WS1a9193826455f5ff18cb41610ec0a2e719-79a4.htm",
            "filename": os.path.join(root_path, "data/objects/LAYER_INDEX_code.txt"),
            "remark": "开始爬取 LAYER_INDEX 对象组码配置"
        },
        # LAYER_FILTER 对象
        {
            "url": "http://docs.autodesk.com/ACD/2011/CHS/filesDXF/WS1a9193826455f5ff18cb41610ec0a2e719-79a2.htm",
            "filename": os.path.join(root_path, "data/objects/LAYER_FILTER_code.txt"),
            "remark": "开始爬取 LAYER_FILTER 对象组码配置"
        },
        # LAYOUT 对象
        {
            "url": "http://docs.autodesk.com/ACD/2011/CHS/filesDXF/WS1a9193826455f5ff18cb41610ec0a2e719-79a0.htm",
            "filename": os.path.join(root_path, "data/objects/LAYOUT_code.txt"),
            "remark": "开始爬取 LAYOUT 对象组码配置"
        },
        # LIGHTLIST 对象
        {
            "url": "http://docs.autodesk.com/ACD/2011/CHS/filesDXF/WS1a9193826455f5ff18cb41610ec0a2e719-799e.htm",
            "filename": os.path.join(root_path, "data/objects/LIGHTLIST_code.txt"),
            "remark": "开始爬取 LIGHTLIST 对象组码配置"
        },
        # MATERIAL 对象
        {
            "url": "http://docs.autodesk.com/ACD/2011/CHS/filesDXF/WS1a9193826455f5ff18cb41610ec0a2e719-799c.htm",
            "filename": os.path.join(root_path, "data/objects/MATERIAL_code.txt"),
            "remark": "开始爬取 MATERIAL 对象组码配置"
        },
        # MLINESTYLE 对象
        {
            "url": "http://docs.autodesk.com/ACD/2011/CHS/filesDXF/WS1a9193826455f5ff18cb41610ec0a2e719-799a.htm",
            "filename": os.path.join(root_path, "data/objects/MLINESTYLE_code.txt"),
            "remark": "开始爬取 MLINESTYLE 对象组码配置"
        },
        # OBJECT_PTR 对象
        {
            "url": "http://docs.autodesk.com/ACD/2011/CHS/filesDXF/WS1a9193826455f5ff18cb41610ec0a2e719-7998.htm",
            "filename": os.path.join(root_path, "data/objects/OBJECT_PTR_code.txt"),
            "remark": "开始爬取 OBJECT_PTR 对象组码配置"
        },
        # PLOTSETTINGS 对象
        {
            "url": "http://docs.autodesk.com/ACD/2011/CHS/filesDXF/WS1a9193826455f5ff18cb41610ec0a2e719-7996.htm",
            "filename": os.path.join(root_path, "data/objects/PLOTSETTINGS_code.txt"),
            "remark": "开始爬取 PLOTSETTINGS 对象组码配置"
        },
        # RASTERVARIABLES 对象
        {
            "url": "http://docs.autodesk.com/ACD/2011/CHS/filesDXF/WS1a9193826455f5ff18cb41610ec0a2e719-7994.htm",
            "filename": os.path.join(root_path, "data/objects/RASTERVARIABLES_code.txt"),
            "remark": "开始爬取 RASTERVARIABLES 对象组码配置"
        },
        # RENDER 对象
        {
            "url": "http://docs.autodesk.com/ACD/2011/CHS/filesDXF/WS1a9193826455f5ff18cb41610ec0a2e719-7992.htm",
            "filename": os.path.join(root_path, "data/objects/RENDER_code.txt"),
            "remark": "开始爬取 RENDER 对象组码配置"
        },
        # SECTION 对象
        {
            "url": "http://docs.autodesk.com/ACD/2011/CHS/filesDXF/WS1a9193826455f5ff18cb41610ec0a2e719-798b.htm",
            "filename": os.path.join(root_path, "data/objects/SECTION_code.txt"),
            "remark": "开始爬取 SECTION 对象组码配置"
        },
        # SPATIAL_INDEX 对象
        {
            "url": "http://docs.autodesk.com/ACD/2011/CHS/filesDXF/WS1a9193826455f5ff18cb41610ec0a2e719-7982.htm",
            "filename": os.path.join(root_path, "data/objects/SPATIAL_INDEX_code.txt"),
            "remark": "开始爬取 SPATIAL_INDEX 对象组码配置"
        },
        # SPATIAL_FILTER 对象
        {
            "url": "http://docs.autodesk.com/ACD/2011/CHS/filesDXF/WS1a9193826455f5ff18cb41610ec0a2e719-7980.htm",
            "filename": os.path.join(root_path, "data/objects/SPATIAL_FILTER_code.txt"),
            "remark": "开始爬取 SPATIAL_FILTER 对象组码配置"
        },
        # SORTENTSTABLE 对象
        {
            "url": "http://docs.autodesk.com/ACD/2011/CHS/filesDXF/WS1a9193826455f5ff18cb41610ec0a2e719-797e.htm",
            "filename": os.path.join(root_path, "data/objects/SORTENTSTABLE_code.txt"),
            "remark": "开始爬取 SORTENTSTABLE 对象组码配置"
        },
        # TABLESTYLE 对象
        {
            "url": "http://docs.autodesk.com/ACD/2011/CHS/filesDXF/WS1a9193826455f5ff18cb41610ec0a2e719-797a.htm",
            "filename": os.path.join(root_path, "data/objects/TABLESTYLE_code.txt"),
            "remark": "开始爬取 TABLESTYLE 对象组码配置"
        },
        # UNDERLAYDEFINITION 对象
        {
            "url": "http://docs.autodesk.com/ACD/2011/CHS/filesDXF/WS1a9193826455f5ff18cb41610ec0a2e719-7978.htm",
            "filename": os.path.join(root_path, "data/objects/UNDERLAYDEFINITION_code.txt"),
            "remark": "开始爬取 UNDERLAYDEFINITION 对象组码配置"
        },
        # VISUALSTYLE 对象
        {
            "url": "http://docs.autodesk.com/ACD/2011/CHS/filesDXF/WS1a9193826455f5ff18cb41610ec0a2e719-7976.htm",
            "filename": os.path.join(root_path, "data/objects/VISUALSTYLE_code.txt"),
            "remark": "开始爬取 VISUALSTYLE 对象组码配置"
        },
        # VBA_PROJECT 对象
        {
            "url": "http://docs.autodesk.com/ACD/2011/CHS/filesDXF/WS1a9193826455f5ff18cb41610ec0a2e719-7974.htm",
            "filename": os.path.join(root_path, "data/objects/VBA_PROJECT_code.txt"),
            "remark": "开始爬取 VBA_PROJECT 对象组码配置"
        },
        # WIPEOUTVARIABLES 对象
        {
            "url": "http://docs.autodesk.com/ACD/2011/CHS/filesDXF/WS1a9193826455f5ff18cb41610ec0a2e719-7972.htm",
            "filename": os.path.join(root_path, "data/objects/WIPEOUTVARIABLES_code.txt"),
            "remark": "开始爬取 WIPEOUTVARIABLES 对象组码配置"
        },
        # XRECORD 对象
        {
            "url": "http://docs.autodesk.com/ACD/2011/CHS/filesDXF/WS1a9193826455f5ff18cb41610ec0a2e719-7970.htm",
            "filename": os.path.join(root_path, "data/objects/XRECORD_code.txt"),
            "remark": "开始爬取 XRECORD 对象组码配置"
        }
    ],

    # THUMBNAILIMAGE 段爬虫配置
    [
        # THUMBNAILIMAGE 段组码
        {
            "url": "http://docs.autodesk.com/ACD/2011/CHS/filesDXF/WS1a9193826455f5ff18cb41610ec0a2e719-796e.htm",
            "filename": os.path.join(root_path, "data/thumbnailimage/THUMBNAILIMAGE_code.txt"),
            "remark": "开始爬取 THUMBNAILIMAGE 段组码配置..."
        }
    ]
]
# Make sure the output directory tree (data/ plus one sub-directory per DXF section) exists.
paths = [os.path.join(root_path, "data"), os.path.join(root_path, "data/headers"),
         os.path.join(root_path, "data/classes"), os.path.join(root_path, "data/tables"),
         os.path.join(root_path, "data/blocks"), os.path.join(root_path, "data/entities"),
         os.path.join(root_path, "data/objects"), os.path.join(root_path, "data/thumbnailimage")]
for path in paths:
    # makedirs also creates missing parents (including root_path itself), and
    # exist_ok=True removes the check-then-create race of exists() + mkdir().
    os.makedirs(path, exist_ok=True)


def para_context(context):
    """
    Parse an HTML page into nested lists of paragraph texts.

    Every ``<tr>`` in the page becomes one row. For the first two rows
    (index 0 and 1) the ``<th>`` cells are collected; for all later rows the
    ``<td>`` cells are collected. Each cell is represented by the list of the
    texts of its ``<p>`` elements, with newline characters removed.

    Args:
        context: HTML string
    Returns:
        rows: list of rows; each row is a list of cells; each cell is a list
            of paragraph strings
    """
    soup = BeautifulSoup(context, 'lxml')
    rows = []
    for row_no, row in enumerate(soup.find_all('tr')):
        # The first two rows are read as header cells, the rest as data cells.
        cell_tag = 'td' if row_no > 1 else 'th'
        rows.append([
            [para.text.replace('\n', '') for para in cell.find_all('p')]
            for cell in row.find_all(cell_tag)
        ])
    return rows


def write_data(data, file_name):
    """
    Write parsed table data to a UTF-8 text file, one table row per line.

    Within a line, cells are separated by '|' and the paragraphs of a single
    cell are separated by ','. A cell without paragraphs is written as an
    empty field, so the number of '|' separators always matches the number of
    cells in the row. A row without cells is written as an empty line.

    Args:
        data: iterable of rows; each row is an iterable of cells; each cell is
            an iterable of strings
        file_name: path of the file to write (overwritten if it exists)
    Returns:
        None
    """
    with open(file_name, 'w', encoding='utf-8') as f:
        for tr in data:
            # join() places separators only *between* items, so an empty cell
            # cannot swallow the separator that precedes it.
            f.write('|'.join(','.join(td) for td in tr) + '\n')


def run_requests(url, file_name):
    """
    Download one page, parse it and save the parsed table to a file.

    Args:
        url: address of the page to crawl
        file_name: path of the file the parsed data is saved to
    Returns:
        None
    Raises:
        Nothing is caught here: errors from ``requests.get``, from the UTF-8
        decoding of the body, from ``para_context`` or from ``write_data``
        propagate to the caller.
    """
    # Anti-anti-crawler setting: pick a random User-Agent for every request
    # and send "Connection: close".
    header = {
        "User-Agent": random.choice(user_agent),
        "Connection": "close"
    }
    response = requests.get(url, headers=header, timeout=8)
    try:
        # NOTE(review): the HTTP status code is not checked, so an error page
        # is parsed and saved like a regular page - confirm this is intended.
        context = response.content.decode('utf-8')

        # Extract the required data
        data = para_context(context)
        # Write it to the target file
        write_data(data, file_name)
    finally:
        # Release the connection even when decoding, parsing or writing fails.
        response.close()

    # Anti-anti-crawler setting: pause after each successful page
    time.sleep(3)


def request_autocad():
    """
    Crawl the DXF group-code pages described by the module-level ``sections``.

    Entries are numbered from 1 in iteration order. If the marker file
    ``symbol.txt`` exists, the number stored in it is used as the entry to
    resume from and all earlier entries are skipped. Entries whose ``url`` or
    ``filename`` is ``None``, or whose target file already exists, are skipped
    as well. Each remaining entry is crawled with ``run_requests``; after a
    failure the crawl is retried once after 5 seconds, and if the retry also
    fails the current entry number is written to the marker file and the
    program exits with status 1.

    Raises:
        SystemExit: with code 1 when an entry fails twice in a row.
    """
    # Marker file holding the number of the entry to resume from
    symbol_file = 'symbol.txt'
    index = 0
    start_index = 0
    if os.path.exists(symbol_file):
        # Read the saved crawl progress
        with open(symbol_file, 'r') as f:
            try:
                start_index = int(f.readline())
            except ValueError:
                # An empty or corrupted marker must not crash the crawler;
                # already downloaded files are skipped further below anyway.
                print("进度标记文件 `{}` 内容无效，将从头开始爬取".format(symbol_file))
                start_index = 0

    for section in sections:
        for doc_dict in section:
            index += 1

            # Pre-crawl checks
            # Skip everything before the saved progress
            if start_index > 0 and index < start_index:
                continue
            # Skip incomplete configuration entries
            if doc_dict["url"] is None or doc_dict["filename"] is None:
                print("当前配置项为空！")
                continue
            # Skip entries whose target file already exists
            if os.path.exists(doc_dict["filename"]):
                print("当前组码配置文件 `{}` 文件已存在，跳过爬取...".format(doc_dict["filename"]))
                continue

            try:
                # First crawl attempt
                print(doc_dict["remark"])
                run_requests(doc_dict["url"], doc_dict["filename"])
                print("爬取成功，已保存到目标文件夹的`data/`目录下！")
            except Exception as e:
                print(e, '\n爬取失败，等候5s后再继续爬')
                time.sleep(5)
                try:
                    # Second crawl attempt
                    run_requests(doc_dict["url"], doc_dict["filename"])
                    print("第二次爬取成功，已保存到目标文件夹的`data/`目录下！")
                except Exception as e:
                    print(e, '\n尝试第二次爬取失败，停止爬虫并记录当前爬取的序号')
                    # Save the crawl progress and stop the program
                    with open(symbol_file, 'w') as f:
                        f.write(str(index))
                    # SystemExit is raised directly: the ``exit`` builtin is
                    # injected by the ``site`` module and is not always present.
                    raise SystemExit(1)
            finally:
                # Separator printed after every processed entry, also on exit
                print("-------------------------------")


# Script entry point: start the crawl only when this file is executed directly.
if __name__ == '__main__':
    request_autocad()
